In [1]:
# Load the dataset used for sentiment classification.
import pandas as pd

data = pd.read_csv(filepath_or_buffer="SerbMR-2C.csv")

# Display the first five rows of the dataset.
data.head()

Unnamed: 0,Text,class-att
0,Braća Koen (Coen brothers) iako poznati po tri...,POSITIVE
1,Često upadam u veliku raspravu kada se izjasni...,POSITIVE
2,“Ulični Psi” je jedan od onih filmova koji sve...,POSITIVE
3,Užasno potcenjen film! Predvođen dvojicom najp...,POSITIVE
4,"Naprosto sam oduševljen ovim projektom, ovo je...",POSITIVE


In [3]:
# Obucavanje multinomijalnog naivnog Bajesovog klasifikatora na svim podacima
corpus = data['Text']
y = data['class-att']
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
clf.fit(X, y)

# Obucavanje i evaluacija multinomijalnog naivnog Bajesovog klasifikatora pomocu jednostruke train/test podele
from sklearn.model_selection import train_test_split
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=0)
X_train = vectorizer.fit_transform(corpus_train)
clf.fit(X_train, y_train)

X_test = vectorizer.transform(corpus_test)
y_pred = clf.predict(X_test)
print("Accuracy: ", ((y_test == y_pred).sum() / X_test.shape[0]))

Accuracy:  0.8011869436201781


In [3]:
# Train and evaluate the multinomial naive Bayes classifier with 10-fold
# cross-validation.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Vectorizer and classifier are chained so that cross_val_score fits them
# together as a single estimator.
p_clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# Per-fold scores first, then their mean.
acc = cross_val_score(p_clf, corpus, y, cv=10)
print(acc)
print("Accuracy: ", acc.mean())

[0.88757396 0.81065089 0.82738095 0.67261905 0.7202381  0.72619048
 0.69047619 0.63690476 0.8452381  0.75      ]
Accuracy:  0.7567272471118625


In [4]:
# Logistic regression on raw token counts, evaluated with 10-fold
# cross-validation and without any hyperparameter tuning.
from sklearn.linear_model import LogisticRegression

# liblinear uses the one-vs-rest (OVR) strategy.
clf = LogisticRegression(solver='liblinear')
p_clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', clf),
])

# Per-fold accuracy followed by its mean.
acc = cross_val_score(p_clf, corpus, y, scoring='accuracy', cv=10)
print(acc)
print("Accuracy: ", acc.mean())

# Repeat the evaluation with the macro-averaged F1 score.
f1 = cross_val_score(p_clf, corpus, y, scoring='f1_macro', cv=10)
print(f1)
print("F-measure: ", f1.mean())

[0.8816568  0.85207101 0.71428571 0.74404762 0.69642857 0.79166667
 0.68452381 0.69642857 0.72619048 0.78571429]
Accuracy:  0.7573013524936602
[0.88132022 0.85188431 0.70930065 0.74360248 0.69641782 0.78951137
 0.68424412 0.69615944 0.72556818 0.78568391]
F-measure:  0.7563692505355197


In [5]:
# Logistic regression on TF-IDF features (sublinear term frequency, IDF
# weighting enabled), evaluated with 10-fold cross-validation and without
# any hyperparameter tuning.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# liblinear uses the one-vs-rest (OVR) strategy.
clf = LogisticRegression(solver='liblinear')
p_clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, use_idf=True)),
    ('classifier', clf),
])

# Per-fold accuracy followed by its mean.
acc = cross_val_score(p_clf, corpus, y, scoring='accuracy', cv=10)
print(acc)
print("Accuracy: ", acc.mean())

[0.92307692 0.88757396 0.77380952 0.73214286 0.70833333 0.79761905
 0.74404762 0.66666667 0.8452381  0.80952381]
Accuracy:  0.7888031839954917


In [6]:
# Logistic regression with hyperparameter optimization, evaluated by nested
# cross-validation: GridSearchCV selects C on 2 inner folds, while
# cross_val_score measures accuracy on 5 outer folds.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

clf = LogisticRegression(solver='liblinear')
# Candidate values for the regularization parameter C of the 'classifier' step.
p_grid_lr = {'classifier__C': [0.1, 1.0, 10]}
p_clf = Pipeline([('vectorizer', CountVectorizer()), ('classifier', clf)])

# Variant 1: integer cv values, i.e. stratified folds taken in file order
# without shuffling.
gs_clf = GridSearchCV(estimator=p_clf, param_grid=p_grid_lr, cv=2, scoring='accuracy')
acc = cross_val_score(gs_clf, corpus, y, cv=5, scoring='accuracy')
print(acc)
print("Optimized Accuracy: ", acc.mean())

# Variant 2 (better option): build the stratified CV splits explicitly so the
# data are shuffled. random_state is fixed because shuffle=True without a seed
# produces different folds, and therefore different scores, on every run.
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
gs_clf = GridSearchCV(estimator=p_clf, param_grid=p_grid_lr, cv=inner_cv, scoring='accuracy')
acc = cross_val_score(gs_clf, corpus, y, cv=outer_cv, scoring='accuracy')
print(acc)
print("Optimized Accuracy: ", acc.mean())

[0.84866469 0.7388724  0.76785714 0.68154762 0.75595238]
Optimized Accuracy:  0.7585788469690546
[0.77151335 0.77151335 0.77380952 0.72916667 0.76785714]
Optimized Accuracy:  0.7627720079129574


In [7]:
# Linear SVM evaluated with 5-fold cross-validation, with and without
# hyperparameter optimization.
from sklearn.svm import LinearSVC

# SVM without a kernel, L2 regularization, squared hinge (L2) loss.
# dual=True selects the dual formulation of the optimization problem.
# random_state is fixed because the dual solver shuffles the data using a
# pseudo-random number generator.
clf = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, max_iter=100000, random_state=0)
# Candidate values for the regularization parameter C of the 'classifier' step.
p_grid_svm = {'classifier__C': [0.1, 1.0, 10]}
p_clf = Pipeline([('vectorizer', CountVectorizer()), ('classifier', clf)])
# Grid search over C on 2 inner folds; wrapped by cross_val_score below.
gs_clf = GridSearchCV(estimator=p_clf, param_grid=p_grid_svm, cv=2, scoring='accuracy')

# Baseline: default C, no tuning.
acc = cross_val_score(p_clf, corpus, y, cv=5, scoring='accuracy')
print(acc)
print("Accuracy: ", acc.mean())

# Tuned: C is selected by the inner grid search within each outer fold.
acc = cross_val_score(gs_clf, corpus, y, cv=5, scoring='accuracy')
print(acc)
print("Optimized Accuracy: ", acc.mean())

[0.82492582 0.72997033 0.77678571 0.69345238 0.75      ]
Accuracy:  0.755026847534266
[0.82492582 0.73293769 0.77678571 0.69345238 0.75      ]
Optimized Accuracy:  0.7556203193443549
